###############################
#### directories
###############################
directory_figures <- c("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/c.data_analysis/")
# Vector with stages in develpmental order
ciona_stages <- c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva")
###############################
#### color palette
###############################
ciona_stages_palette <- c("#009392FF", "#39B185FF", "#9CCB86FF", "#E9E29CFF", "#EEB479FF", "#E88471FF", "#CF597EFF", "#D82526FF")
ciona_cluster_palette <- c("#88A0DCFF", "#381A61FF", "#7C4B73FF", "#ED968CFF", "#AB3329FF", "#E78429FF", "#F9D14AFF", "#7FA074FF") # > paletteer::paletteer_d("MetBrewer::Archambault") + MetBrewer::Cassatt2" - Colorblind-Friendly
isoform_palette <- c("#FBE426", "#36b39c", "#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5", "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26", "#AA0DFE", "#3283FE", "#85660D", "#782AB6", "#565656", "#1C8356", "#16FF32", "#F7E1A0", "#E2E2E2", "#1CBE4F", "#C4451C", "#DEA0FD", "#FE00FA", "#325A9B", "#FEAF16", "#F8A19F", "#90AD1C")
theme_personalised <- theme_pubr(base_family = "Avenir") +
theme(panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank())01.Ciona_prot_QC
1 Ciona proteome
1.0.1 Loading libraries
1.0.2 Functions & misc & loading data
1.1 QC MS data with LDA
Starting with checking the quality of MS
1.1.1 All fractions together
###############################
#### There are multiple fractions, first combine all
###############################
# forward = target
# reverse = decoy
# Files
files <- c("LDA_1696944876.csv", "LDA_1696944628.csv", "LDA_1696944697.csv", "LDA_1696944803.csv")
full_paths <- file.path(directory_figures, files)
# list of df
data_list <- map(full_paths, ~read_csv(.x, col_names = T))
# column names
col_names <- read_csv(full_paths[1], n_max = 0) %>% names()
data_list <- map(data_list, set_names, nm = col_names)
# Bind df together
LDA_data <- bind_rows(data_list) |>
clean_names() |>
dplyr::rename("delta_corr" = "number_916_corr")
LDA_data |> head()# A tibble: 6 × 17
scan_f scan_l z rank x_corr delta_corr uniq_number_916_corr number_ions number_ions_link reference reference_link peptide peptide_link pept_length gene_symbol annotation lda_score
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
1 130 130 2 1 1.80 0.345 0.345 9/34 /gfy/www/modules//sdig/index.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=1&scanf=130&charge=2&value=9/34 ##KY21.Chr4.75.v1.nonSL1-1 /gfy/www/modules//consensus/?run_id=21229&search_id=28317&scans_id=21229&peptide_id=1&scanf=130&charge=2&value=##KY21.Chr4.75.v1.nonSL1-1 K.MFIGFVAMVWTQSFKFFK.R /gfy/www/modules/show_out/show_out_wrapper.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=1&scanf=130&charge=2&value=K.MFIGFVAMVWTQSFKFFK.R 18 <NA> #### 1.12
2 131 131 2 1 3.83 0.691 0.691 22/26 /gfy/www/modules//sdig/index.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=2&scanf=131&charge=2&value=22/26 KY21.Chr1.2009.v1.ND1-1 /gfy/www/modules//consensus/?run_id=21229&search_id=28317&scans_id=21229&peptide_id=2&scanf=131&charge=2&value=KY21.Chr1.2009.v1.ND1-1 K.PNLVDWLSDWSMEK.S /gfy/www/modules/show_out/show_out_wrapper.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=2&scanf=131&charge=2&value=K.PNLVDWLSDWSMEK.S 14 <NA> <NA> 2.06
3 132 132 2 1 0.363 0.235 0.235 2/26 /gfy/www/modules//sdig/index.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=3&scanf=132&charge=2&value=2/26 KY21.Chr5.42.v1.nonSL5-1 /gfy/www/modules//consensus/?run_id=21229&search_id=28317&scans_id=21229&peptide_id=3&scanf=132&charge=2&value=KY21.Chr5.42.v1.nonSL5-1 R.DFM*ETLNTLKYANR.A /gfy/www/modules/show_out/show_out_wrapper.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=3&scanf=132&charge=2&value=R.DFM*ETLNTLKYANR.A 14 <NA> <NA> -2.10
4 133 133 2 1 0.658 0.037 0.037 4/26 /gfy/www/modules//sdig/index.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=4&scanf=133&charge=2&value=4/26 ##KY21.Chr12.421.v1.ND1-1 /gfy/www/modules//consensus/?run_id=21229&search_id=28317&scans_id=21229&peptide_id=4&scanf=133&charge=2&value=##KY21.Chr12.421.v1.ND1-1 R.PQSYDIILVIDDLK.L /gfy/www/modules/show_out/show_out_wrapper.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=4&scanf=133&charge=2&value=R.PQSYDIILVIDDLK.L 14 <NA> #### -1.68
5 134 134 3 1 0.374 0.015 0.015 4/112 /gfy/www/modules//sdig/index.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=5&scanf=134&charge=3&value=4/112 KY21.Chr10.1078.v1.SL1-1 /gfy/www/modules//consensus/?run_id=21229&search_id=28317&scans_id=21229&peptide_id=5&scanf=134&charge=3&value=KY21.Chr10.1078.v1.SL1-1 K.LGSHYLVMVSLPFAEDVRPLSFESLFSGK.N /gfy/www/modules/show_out/show_out_wrapper.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=5&scanf=134&charge=3&value=K.LGSHYLVMVSLPFAEDVRPLSFESLFSGK.N 29 <NA> <NA> -1.64
6 135 135 2 1 0.222 0 0.04 1/28 /gfy/www/modules//sdig/index.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=6&scanf=135&charge=2&value=1/28 ##KY21.Chr4.577.v1.ND1-1 /gfy/www/modules//consensus/?run_id=21229&search_id=28317&scans_id=21229&peptide_id=6&scanf=135&charge=2&value=##KY21.Chr4.577.v1.ND1-1 R.LYKSLGM*ELQLTILR.H /gfy/www/modules/show_out/show_out_wrapper.php?run_id=21229&search_id=28317&scans_id=21229&peptide_id=6&scanf=135&charge=2&value=R.LYKSLGM*ELQLTILR.H 15 <NA> #### -1.72
# the $ in front of the protein are the reverse/decoy ones
###############################
#### plots
###############################
LDA_data |>
mutate(direction = if_else(str_detect(reference, "^##"), "reverse", "forward")) |>
filter(str_detect(reference, '_HUMAN_contaminant') &
x_corr > 1 ) |>
ggplot(aes(x=lda_score, fill=direction)) +
geom_histogram(binwidth=0.1, position="identity") +
scale_fill_manual(values = c("#BDE1C2", "#D20076FF")) +
scale_color_manual(values = c("#BDE1C2", "#D20076FF")) +
labs(x="LDA score", y="Count", fill="direction") +
theme_personalised +
LDA_data |>
mutate(direction = if_else(str_detect(reference, "^##"), "reverse", "forward")) |>
filter(str_detect(reference, '_HUMAN_contaminant') &
x_corr > 1 ) |>
ggplot(aes(x=lda_score, fill=direction,color=direction)) +
geom_freqpoly(binwidth = 0.3) +
scale_fill_manual(values = c("#BDE1C2", "#D20076FF")) +
scale_color_manual(values = c("#BDE1C2", "#D20076FF")) +
labs(x="LDA Score", y="Count", fill="direction") +
theme_personalised1.1.2 Example of a fraction
###############################
#### LDA fitlering at 1% FDR change for each fraction - plotting just one as an example
###############################
# Score Cutoff: 1.7402
lda <- as_tibble(
read_excel(path = paste0(directory_figures, "lda_single_fraction.xlsx"),
sheet = "Mariossi_1696882402",
# range = cell_cols("A:K"),
col_names=TRUE)) |>
clean_names()
# https://stackoverflow.com/questions/74778096/customize-the-position-of-geom-rug
limits <- range(lda$lda_score)
step <- diff(limits) * 0.1
size <- 0.45 * step
plot_lda <- lda %>%
mutate(
direction = if_else(str_detect(reference, "^##"), "Decoy", "Target"),
expression= if_else(direction == "Decoy", "5","16"),
lda_group = case_when(
lda_score > 1.7402 & direction == "Target" ~ "Target pass",
lda_score < 1.7402 & direction == "Target" ~ "Target fail",
TRUE ~ as.character(direction)
)
) %>%
mutate(lda_group = factor(lda_group, levels = c("Target fail", "Target pass", "Decoy"))) %>%
filter(x_corr > 1 & lda_score != 0) %>%
ggplot(aes(fill=lda_group)) +
geom_histogram(aes(x=lda_score, alpha=lda_group), color="black", binwidth=0.1, position="identity") +
geom_segment(
aes(
colour = lda_group,
x=lda_score,
xend = lda_score,
y = limits[1] - as.numeric(expression) * step + size*10,
yend = limits[1] - as.numeric(expression) * step - size*10
)
)+
scale_fill_manual(values = c(`Target fail` = "#BDE1C2", `Target pass` = "#00A087FF", Decoy = "#D20076FF")) +
scale_alpha_manual(values = c(`Target fail` = 0.1, `Target pass` = 1, Decoy = 1), guide="none") +
scale_color_manual(values = c(`Target fail` = "#BDE1C2", `Target pass` = "#00A087FF", Decoy = "#D20076FF"), guide="none") +
annotate("text", x = -1.2, y = 150, size=5,label = "XCorr > 1") +
labs(x="LDA score", y="Counts", fill="LDA Group") +
theme_personalised +
theme(legend.position=c(0.1, 0.83),
legend.background = element_blank(),
legend.title=element_blank()
)
plot_lda cairo_pdf("plot_lda.pdf", width=8.5, height=5.6)
print(plot_lda)
invisible(dev.off())1.2 Descriptive statistics
###############################
#### loading the data
###############################
ciona_prot.df <- as_tibble(read_delim(paste0(directory_figures, "ciona_proteome_normTo1.tsv")), col_names = T, trim_ws = T)
glimpse(ciona_prot.df)Rows: 7,095
Columns: 13
$ protein_id_transcript <chr> "KY21.Chr1.10.v1.nonSL2-1", "KY21.Chr1.1000.v1.SL1-1", "KY21.Chr1.1001.v1.SL1-1", "KY21.Chr1.1002.v1.SL1-1", "KY21.Chr1.1004.v1.nonSL4-1", "KY21.Chr1.1005.v1.ND1-1", "KY21.Chr1.1006.v1.SL1-1", "KY21.Chr1.1009.v1.ND1-1", "KY21.Chr1.101.v1.SL1-1", "KY21.Chr1.1010.v1.nonSL1-1", "KY21.Chr1.1011.v1.SL1-1", "KY21.Chr1.1012.v1.SL1-1", "KY21.Chr1.1013.v1.ND1-1", "KY21.Chr1.1014.v2.ND2-1", "KY21.Chr1.1015.v1.SL1-1", "KY21.Chr1.1016.v1.SL1-1", "KY21.Chr1.1018.v1.nonSL1-1", "KY21.Chr1.1019.v1.SL1-1", "KY21.Chr1.102.v1.nonSL2-1", "KY21.Chr1.1020.v1.SL1-1", "KY21.Chr1.1022.v1.ND1-1", "KY21.Chr1.1024.v1.SL1-1", "KY21.Chr1.1027.v2.nonSL3-1", "KY21.Chr1.1029.v1.SL1-1", "KY21.Chr1.1031.v1.ND1-1", "KY21.Chr1.1035.v1.SL1-1", "KY21.Chr1.1040.v1.nonSL7-1", "KY21.Chr1.1041.v1.nonSL2-1", "KY21.Chr1.1042.v1.nonSL4-1", "KY21.Chr1.1045.v1.SL1-1", "KY21.Chr1.1046.v1.SL1-1", "KY21.Chr1.1050.v1.SL1-1", "KY21.Chr1.1051.v1.SL1-1", "KY21.Chr1.1055.v1.SL1-1", "KY21.Chr1.1056.v1.nonSL3-1…
$ protein_id_gene <chr> "KY21.Chr1.10", "KY21.Chr1.1000", "KY21.Chr1.1001", "KY21.Chr1.1002", "KY21.Chr1.1004", "KY21.Chr1.1005", "KY21.Chr1.1006", "KY21.Chr1.1009", "KY21.Chr1.101", "KY21.Chr1.1010", "KY21.Chr1.1011", "KY21.Chr1.1012", "KY21.Chr1.1013", "KY21.Chr1.1014", "KY21.Chr1.1015", "KY21.Chr1.1016", "KY21.Chr1.1018", "KY21.Chr1.1019", "KY21.Chr1.102", "KY21.Chr1.1020", "KY21.Chr1.1022", "KY21.Chr1.1024", "KY21.Chr1.1027", "KY21.Chr1.1029", "KY21.Chr1.1031", "KY21.Chr1.1035", "KY21.Chr1.1040", "KY21.Chr1.1041", "KY21.Chr1.1042", "KY21.Chr1.1045", "KY21.Chr1.1046", "KY21.Chr1.1050", "KY21.Chr1.1051", "KY21.Chr1.1055", "KY21.Chr1.1056", "KY21.Chr1.1057", "KY21.Chr1.1058", "KY21.Chr1.1060", "KY21.Chr1.1061", "KY21.Chr1.1062", "KY21.Chr1.1063", "KY21.Chr1.1066", "KY21.Chr1.107", "KY21.Chr1.1072", "KY21.Chr1.1073", "KY21.Chr1.1077", "KY21.Chr1.1078", "KY21.Chr1.108", "KY21.Chr1.1085", "KY21.Chr1.1088", "KY21.Chr1.1089", "KY21.Chr1.109", "KY21.Chr1.1093", "KY21.Chr1.1095", "KY…
$ human_id <chr> "TAF9", "PSMB7", "LRRC47", "TDRD1", "DNAJC2", "MYO18B", "NUAK1", "SUMF1", "NAA10", "GAS2", "SLC9A9", "SPEN", "SPEN", "XPO7", "RFK", "CPOX", "ABRAXAS2", "PPM1G", "DNAJB9", "MGME1", "CCDC15", "CEP85", "GNAI1", "UVRAG", "EPC1", "DDX58", "APIP", "GPR155", "SH3GLB1", "TXN", "RBM33", "GSTM3", "CYRIB", "PRIM2", "NSMCE1", "TACC1", "ZNF143", "SCYL3", "DDX39B", "KIFAP3", "NR6A1", "COG7", "WDR33", "RPL8", "ZSWIM8", "TTC21B", "GRB2", "CYB5D1", "DEPDC5", "SRPRB", "DCUN1D1", "ENKUR", "ACAP2", "AMBRA1", "COPB2", "GRAMD1A", "TFDP1", "VAMP3", "TCEA3", "COP1", "ERI1", "NPHP4", "BICD2", "PTPMT1", "MCM4", "NINL", "GNB1L", "PUS7L", "PRKAR2A", "HSP90B1", "AGL", "SYCE2", "TEKT2", "GMNN", "VPS28", "CTSC", "TTPAL", "C12orf4", "UVSSA", "CIB1", "XPNPEP1", "KIF3B", "ZNF830", "CYB5R2", "CASK", "RAB23", "DDX19B", "EXOC4", "TCEA1", "NR2C2AP", "MRRF", "BARD1", "MFAP1", "IRAK1", "DLD", "MRPL52", "RAB7A", "USP15", "MMAB", "SLC25A11", "GREB1", "MYT1L", "SNX33", "GCC1", "MED4", "NUP98", "B…
$ hgnc <chr> "HGNC:11542", "HGNC:9544", "HGNC:29207", "HGNC:11712", "HGNC:13192", "HGNC:18150", "HGNC:14311", "HGNC:20376", "HGNC:18704", "HGNC:4167", "HGNC:20653", "HGNC:17575", "HGNC:17575", "HGNC:14108", "HGNC:30324", "HGNC:2321", "HGNC:28975", "HGNC:9278", "HGNC:6968", "HGNC:16205", "HGNC:25798", "HGNC:25309", "HGNC:4384", "HGNC:12640", "HGNC:19876", "HGNC:19102", "HGNC:17581", "HGNC:22951", "HGNC:10833", "HGNC:12435", "HGNC:27223", "HGNC:4635", "HGNC:25216", "HGNC:9370", "HGNC:29897", "HGNC:11522", "HGNC:12928", "HGNC:19285", "HGNC:13917", "HGNC:17060", "HGNC:7985", "HGNC:18622", "HGNC:25651", "HGNC:10368", "HGNC:23528", "HGNC:25660", "HGNC:4566", "HGNC:26516", "HGNC:18423", "HGNC:24085", "HGNC:18184", "HGNC:28388", "HGNC:16469", "HGNC:25990", "HGNC:2232", "HGNC:29305", "HGNC:11749", "HGNC:12644", "HGNC:11615", "HGNC:17440", "HGNC:23994", "HGNC:19104", "HGNC:17208", "HGNC:26965", "HGNC:6947", "HGNC:29163", "HGNC:4397", "HGNC:25276", "HGNC:9391", "HGNC:12028", "H…
$ number_of_peptides <dbl> 5, 13, 15, 6, 21, 9, 1, 12, 9, 1, 2, 24, 5, 25, 1, 11, 3, 11, 2, 2, 2, 10, 7, 2, 3, 8, 2, 1, 8, 4, 8, 17, 6, 14, 3, 13, 1, 4, 23, 3, 5, 13, 9, 34, 7, 11, 1, 2, 2, 17, 12, 1, 12, 2, 39, 5, 6, 2, 4, 5, 5, 4, 7, 4, 30, 8, 2, 17, 7, 142, 23, 1, 1, 7, 6, 14, 15, 5, 5, 4, 13, 9, 4, 7, 10, 3, 17, 10, 10, 1, 5, 1, 6, 6, 54, 4, 14, 32, 5, 19, 3, 1, 8, 10, 1, 5, 10, 4, 2, 2, 25, 2, 5, 64, 3, 5, 7, 10, 1, 1, 2, 5, 9, 26, 6, 22, 5, 8, 2, 1, 4, 27, 1, 1, 33, 19, 1, 13, 11, 1, 4, 4, 2, 15, 2, 20, 8, 24, 3, 2, 3, 4, 24, 15, 5, 3, 1, 1, 6, 1, 3, 24, 19, 9, 1, 26, 7, 8, 5, 1, 42, 11, 1, 13, 2, 4, 29, 22, 3, 7, 6, 2, 1, 13, 19, 3, 28, 8, 1, 4, 15, 9, 3, 25, 11, 7, 4, 14, 2, 9, 6, 7, 8, 10, 3, 5, 6, 1, 6, 1, 3, 1, 20, 16, 28, 4, 2, 3, 11, 1, 18, 2, 2, 3, 6, 6, 9, 2, 10, 8, 4, 2, 1, 1, 4, 4, 3, 7, 1, 5, 24, 1, 5, 2, 23, 3, 2, 15, 18, 17, 3, 15, 2, 1, 18, 2, 4, 7, 4, 13, 9, 10, 1, 21, 2, 15, 8, 2, 2, 10, 8, 3, 4, 29, 10, 4, 10, 38, 3, 3, 4, 3, 12, 12, 4, 53, 15, 3, 4, 1, 8, …
$ unfE <dbl> 1.298049e-01, 1.324528e-01, 1.127597e-01, 4.200519e-01, 1.134175e-01, 1.252194e-01, 2.032697e-01, 1.710183e-01, 1.044630e-01, 2.785118e-01, 8.699938e-02, 1.063616e-01, 1.302725e-01, 1.226167e-01, 1.333502e-01, 1.036764e-01, 9.848785e-02, 1.014728e-01, 3.841175e-02, 1.834680e-01, 5.557100e-02, 1.349891e-01, 1.034271e-01, 9.713907e-02, 1.179749e-01, 7.741159e-02, 7.770750e-02, 2.865548e-01, 1.394184e-01, 9.014801e-01, 5.926869e-02, 1.230879e-01, 9.073042e-02, 1.335176e-01, 1.312359e-01, 1.773455e-01, 1.276592e-01, 7.315110e-02, 8.445418e-02, 1.020476e-01, 1.649247e-02, 1.067782e-01, 1.031701e-01, 1.232679e-01, 4.209236e-02, 1.123042e-01, 8.366374e-01, 4.596535e-02, 1.413864e-01, 1.046384e-01, 1.159262e-01, 7.884029e-03, 8.040522e-02, 7.663698e-02, 1.102994e-01, 5.699762e-02, 1.145377e-01, 1.222085e-01, 3.739114e-02, 1.243875e-01, 1.223155e-01, 1.300714e-01, 1.174618e-01, 6.797665e-02, 1.264099e-01, 9.751044e-02, 1.925553e-01, 1.231327e-01, 9.484711e-02, 1.…
$ fertE <dbl> 0.1259673628, 0.1348872849, 0.1070208546, 0.3998787072, 0.1100268958, 0.1254946261, 0.1764491823, 0.1625296473, 0.1005591244, 0.3124981464, 0.0828260298, 0.1087763791, 0.1128248903, 0.1212120565, 0.1294932728, 0.1063921248, 0.1232894869, 0.0981483299, 0.0368863364, 0.1340376814, 0.0594995071, 0.1437564947, 0.1012506983, 0.0854047794, 0.1060651322, 0.0699248526, 0.0832902489, 0.0813895791, 0.1299898756, 0.0637744491, 0.0747610127, 0.1226230278, 0.0611645675, 0.1283378832, 0.1231017306, 0.1780246180, 0.1111774146, 0.0719104431, 0.0895344843, 0.1074715510, 0.0139331301, 0.1126086854, 0.1077540860, 0.1199514854, 0.0422573796, 0.1097822787, 0.0635442331, 0.0598222959, 0.1198976977, 0.1119800015, 0.1040114955, 0.1516362856, 0.0730188450, 0.0775572143, 0.1062961899, 0.0540386207, 0.1176930579, 0.1207845253, 0.0774454593, 0.1257390141, 0.1268401582, 0.1281410426, 0.1206433908, 0.0818736458, 0.1297553262, 0.1042516577, 0.1108786442, 0.1180802420, 0.1269517666, 0.…
$ cell16 <dbl> 0.1250456449, 0.1328101843, 0.1217596402, 0.0641194480, 0.1233892497, 0.1696345681, 0.0280065481, 0.1660856178, 0.1014044612, 0.0492090452, 0.0974244402, 0.1091126988, 0.1175400224, 0.1278746571, 0.1197704838, 0.1216388784, 0.1373451292, 0.1149117477, 0.0794878241, 0.1219411609, 0.0993291963, 0.1215333709, 0.1253880041, 0.1187471576, 0.1182247980, 0.1025463113, 0.0964897295, 0.0930370244, 0.1268794777, 0.0103578272, 0.1428441909, 0.1381293030, 0.1011210562, 0.1412464172, 0.1373088402, 0.1406442782, 0.1264197227, 0.0864787410, 0.1154138351, 0.1032854792, 0.0102171236, 0.1207738544, 0.1329623983, 0.1294206162, 0.0559302019, 0.1182662662, 0.0004799518, 0.0843884353, 0.1061593830, 0.1251092266, 0.1279873533, 0.0538486900, 0.0930475763, 0.0956709580, 0.1173102106, 0.0644276984, 0.1169321320, 0.1476430094, 0.0497483641, 0.1295388302, 0.1335797736, 0.1525073846, 0.1303249884, 0.1226622137, 0.1316943812, 0.1234315218, 0.2190476699, 0.1306975610, 0.1181505037, 0.…
$ iniG <dbl> 1.201025e-01, 1.146366e-01, 1.177441e-01, 1.547713e-02, 1.218467e-01, 1.474212e-01, 2.431692e-02, 9.674655e-02, 1.233771e-01, 4.386662e-02, 1.181640e-01, 1.178227e-01, 1.057373e-01, 1.272844e-01, 1.531695e-01, 1.195095e-01, 1.380446e-01, 1.311223e-01, 9.721245e-02, 9.179619e-02, 1.814810e-01, 6.419875e-02, 1.148672e-01, 1.187910e-01, 9.138235e-02, 1.237285e-01, 1.307980e-01, 4.581240e-02, 1.183398e-01, 9.455054e-04, 1.274477e-01, 1.149075e-01, 1.053072e-01, 1.377693e-01, 1.178903e-01, 1.151039e-01, 1.395133e-01, 1.196574e-01, 1.046477e-01, 1.175316e-01, 9.691588e-03, 1.371220e-01, 1.299788e-01, 1.199843e-01, 7.834961e-02, 1.035352e-01, 5.807113e-04, 1.337454e-01, 9.920607e-02, 1.298941e-01, 1.307009e-01, 1.459647e-01, 8.871721e-02, 1.431424e-01, 1.181252e-01, 1.865804e-02, 1.107338e-01, 1.202856e-01, 4.768982e-02, 1.107616e-01, 1.192751e-01, 1.614690e-01, 1.185387e-01, 1.790921e-01, 1.286794e-01, 1.580947e-01, 9.062976e-02, 9.979766e-02, 1.161346e-01, 1.…
$ latN <dbl> 1.249958e-01, 1.176930e-01, 1.225389e-01, 2.334494e-02, 1.288561e-01, 1.213391e-01, 1.353770e-01, 9.880142e-02, 1.310360e-01, 5.664927e-02, 1.374202e-01, 1.235143e-01, 1.288313e-01, 1.342176e-01, 1.268391e-01, 1.267021e-01, 1.239978e-01, 1.351063e-01, 1.402561e-01, 9.585023e-02, 2.124226e-01, 8.335679e-02, 1.101131e-01, 1.253734e-01, 1.305984e-01, 1.311726e-01, 1.197510e-01, 1.764569e-02, 1.215315e-01, 4.940914e-04, 1.506236e-01, 1.193120e-01, 1.270588e-01, 1.448363e-01, 1.332241e-01, 1.062659e-01, 1.346756e-01, 1.250050e-01, 1.282348e-01, 1.346201e-01, 2.841909e-01, 1.434135e-01, 1.265789e-01, 1.242013e-01, 1.472287e-01, 1.243490e-01, 5.255602e-04, 1.565178e-01, 9.022618e-02, 1.287399e-01, 1.266559e-01, 6.983314e-02, 1.225538e-01, 1.540599e-01, 1.230494e-01, 2.006765e-02, 1.340700e-01, 1.140707e-01, 3.747232e-02, 1.302608e-01, 1.321272e-01, 1.442147e-01, 1.120498e-01, 1.586591e-01, 1.318796e-01, 1.559365e-01, 7.219405e-02, 1.118971e-01, 1.142572e-01, 1.…
$ midTII <dbl> 0.1077732493, 0.1183431058, 0.1298416904, 0.0248672053, 0.1355945589, 0.1160426877, 0.1456811696, 0.1036418491, 0.1337028314, 0.0570461601, 0.1366937320, 0.1364715547, 0.1288377428, 0.1284267381, 0.1200899658, 0.1320717241, 0.1336096975, 0.1320604291, 0.1880265698, 0.1428987075, 0.1617986810, 0.1106779343, 0.1234949652, 0.1510832931, 0.1378754043, 0.1443671826, 0.1233777760, 0.0712951320, 0.1154652964, 0.0008680424, 0.1470098516, 0.1131466747, 0.1380976013, 0.1181519524, 0.1158007347, 0.0952046470, 0.1282343305, 0.1563352621, 0.1403727657, 0.1335504432, 0.2280586429, 0.1329982158, 0.1354347087, 0.1261179185, 0.1973239848, 0.1417507407, 0.0002929388, 0.1647420625, 0.1147942878, 0.1372483475, 0.1228574088, 0.1052463644, 0.1555390226, 0.1789083710, 0.1404050126, 0.0693326818, 0.1315960484, 0.1063854656, 0.0758183433, 0.1255273590, 0.1216818575, 0.1121900324, 0.1006626954, 0.1607308671, 0.1264820198, 0.1593752228, 0.1028987043, 0.1110920667, 0.1343382230, 0.…
$ latTII <dbl> 0.1323855417, 0.1281513763, 0.1408600114, 0.0300239069, 0.1314632254, 0.0978353289, 0.1428260352, 0.1068325396, 0.1543988512, 0.1198226208, 0.1680491776, 0.1528819631, 0.1364129942, 0.1176940707, 0.1062285554, 0.1453782815, 0.1196228233, 0.1461882747, 0.1851877048, 0.1131266179, 0.1335724529, 0.1536245883, 0.1469536031, 0.1508592978, 0.1387443682, 0.1650351834, 0.1915226408, 0.1288464160, 0.1228062631, 0.0020578708, 0.1662939088, 0.1322169302, 0.1446448857, 0.1055998584, 0.1186735515, 0.1033045166, 0.1057741735, 0.1632213358, 0.1698592292, 0.1520053098, 0.2063258368, 0.1218006127, 0.1312772641, 0.1277019046, 0.2667397474, 0.1503908711, 0.0415974944, 0.1784937759, 0.1570438987, 0.1238861960, 0.1314015102, 0.2243778449, 0.1804974007, 0.1473179842, 0.1323612046, 0.2453531835, 0.1324797239, 0.1240563168, 0.2335502275, 0.1205220858, 0.1211919003, 0.0938088059, 0.1455032273, 0.1187722384, 0.1198381711, 0.1202442661, 0.1245907415, 0.1501543622, 0.1372269508, 0.…
$ larva <dbl> 0.1339249741, 0.1210256914, 0.1474750562, 0.0222367440, 0.1354058320, 0.0970131460, 0.1440734634, 0.0943440902, 0.1510585525, 0.0823963035, 0.1724230172, 0.1450587562, 0.1395431677, 0.1206737802, 0.1110589490, 0.1446310143, 0.1256025887, 0.1409897847, 0.2345312541, 0.1168814599, 0.0963254769, 0.1878629887, 0.1745054139, 0.1526020050, 0.1591346739, 0.1858137758, 0.1770630576, 0.2754189458, 0.1255694383, 0.0200221148, 0.1317509598, 0.1365767000, 0.2318754915, 0.0905406482, 0.1227648170, 0.0841066657, 0.1265461499, 0.2042406962, 0.1674829964, 0.1494879759, 0.2310902780, 0.1245050561, 0.1328438229, 0.1293544904, 0.1700780469, 0.1396214249, 0.0563416675, 0.1763249112, 0.1712860526, 0.1385037780, 0.1404591979, 0.2412089287, 0.2062209235, 0.1267061709, 0.1521534767, 0.4711245047, 0.1419574824, 0.1445659451, 0.4408843193, 0.1332628136, 0.1229884698, 0.0775975991, 0.1548153468, 0.1102331479, 0.1052612427, 0.0811557235, 0.0872051517, 0.1551483014, 0.1580936807, 0.…
vis_dat(ciona_prot.df) # not all Proteins are mapped to Human orthologsvis_miss(ciona_prot.df)# ## loading annoatation from gff3 table - TFs, signalling molecules, genes names and symbols etc.
# annotation.gff3.df <-
# as_tibble(read_excel("/Users/mariossi/Desktop/Annotation/final/df_selex&gff3.xlsx", sheet = "df_selex&gff3", range = cell_cols("A:I"), col_names=TRUE))
#
# # Merging protein dynamics with annotaion table
# KY.TMTproC.df[,which(KY.TMTproC.df$Protein.Id.gene %in% annotation.gff3.df$KY.gene)]
# KY.TMTproC.df <- left_join(KY.TMTproC.df,annotation.df, by = c("Protein.Id.gene" = "KY.gene.model"),keep =T)
#
###############################
#### Summary univariate stats
###############################
descr(ciona_prot.df,
headings = FALSE, # remove headings
stats = "common" # most common descriptive statistics
)
cell16 fertE iniG larva latN latTII midTII
--------------- --------- --------- --------- --------- --------- --------- ---------
Mean 0.11 0.10 0.11 0.17 0.12 0.15 0.13
Std.Dev 0.04 0.05 0.04 0.11 0.04 0.06 0.04
Min 0.00 0.00 0.00 0.00 0.00 0.00 0.00
Median 0.12 0.10 0.12 0.14 0.13 0.14 0.13
Max 0.91 0.61 0.78 0.97 0.67 0.70 0.57
N.Valid 7095.00 7095.00 7095.00 7095.00 7095.00 7095.00 7095.00
Pct.Valid 100.00 100.00 100.00 100.00 100.00 100.00 100.00
Table: Table continues below
number_of_peptides unfE
--------------- -------------------- ---------
Mean 8.80 0.10
Std.Dev 15.85 0.07
Min 1.00 0.00
Median 5.00 0.11
Max 841.00 0.93
N.Valid 7095.00 7095.00
Pct.Valid 100.00 100.00
view(dfSummary(ciona_prot.df[, 5:13]))
# check quantiles
ciona_prot.df %>%
dplyr::select(protein_id_transcript, number_of_peptides) %>%
add_column(group = "same") %>%
# group_by(group) %>%
summarise(
mean = mean(number_of_peptides),
median = median(number_of_peptides),
min = min(number_of_peptides),
max = max(number_of_peptides),
range = diff(range(number_of_peptides, na.rm = TRUE)),
n_distinct = n_distinct(number_of_peptides),
quantile = quantile(number_of_peptides),
) # |> unnest_wider(quantile)# A tibble: 5 × 7
mean median min max range n_distinct quantile
<dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 8.80 5 1 841 840 96 1
2 8.80 5 1 841 840 96 2
3 8.80 5 1 841 840 96 5
4 8.80 5 1 841 840 96 11
5 8.80 5 1 841 840 96 841
# sum values per stage => higher in late developing
ciona_prot.df %>% summarise(across(where(is.numeric), sum))# A tibble: 1 × 9
number_of_peptides unfE fertE cell16 iniG latN midTII latTII larva
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 62471 744. 695. 768. 775. 857. 934. 1095. 1227.
1.3 Peptides & proteins quantification
###############################
#### number of peptides per protein
###############################
# descriptive statistics for peptides
ciona_prot.df %>% summarise(tot_peptides = sum(number_of_peptides))# A tibble: 1 × 1
tot_peptides
<dbl>
1 62471
# we identified 62471 peptides
# % peptites distribution table
peptides_freqTable <-
ciona_prot.df %>%
# drop_na(Protein.Id.transcript) %>%
arrange(desc(number_of_peptides)) %>%
group_by(number_of_peptides) %>%
summarise(n = n()) %>%
mutate(count = number_of_peptides) %>%
group_by(count = case_when(count >= 50 ~ ">=50", TRUE ~ as.character(count))) %>% # sum protein count if peptides higher than 50
mutate(value = sum(n)) %>% # mutate(value = sum(n), .groups = 'drop') %>%
ungroup() %>%
filter(number_of_peptides <= 50) %>%
mutate(
freq = prop.table(n),
freq = value / sum(value), # same way
rel.freq = paste0(round(100 * n / sum(n), 1), "%")
)
peptides_freqTable# A tibble: 50 × 6
number_of_peptides n count value freq rel.freq
<dbl> <int> <chr> <int> <dbl> <chr>
1 1 1122 1 1122 0.158 16%
2 2 909 2 909 0.128 13%
3 3 627 3 627 0.0884 8.9%
4 4 575 4 575 0.0810 8.2%
5 5 494 5 494 0.0696 7.1%
6 6 414 6 414 0.0584 5.9%
7 7 372 7 372 0.0524 5.3%
8 8 303 8 303 0.0427 4.3%
9 9 247 9 247 0.0348 3.5%
10 10 249 10 249 0.0351 3.6%
# ℹ 40 more rows
### Plotting peptides distribution
plot_peptides_freqTable <- peptides_freqTable %>%
# ggplot(aes(reorder(as.factor(n),-n),freq)) +
ggplot(aes(number_of_peptides, freq)) +
# geom_bar(stat="identity",position = "identity", alpha= .50, fill = "grey78") +
geom_col(aes(fill = n), width = 0.9, show.legend = F,fill = "grey78") +
geom_point() +
geom_vline(aes(xintercept = 8.80), color = "blue", linetype = "dashed", size = 0.7) + # mean
geom_vline(aes(xintercept = 5), color = "red", linetype = 3, size = 0.7) + # median
geom_text(aes(label = value), vjust = -1, angle = 45) +
scale_y_continuous(labels = scales::percent, expand = c(0, 0.001), limits = c(0, 0.17)) +
scale_x_continuous(
breaks = peptides_freqTable$number_of_peptides,
labels = peptides_freqTable$count,
expand = c(0.01, 0)
) +
#scale_fill_gradientn(colours = wes_palette("Darjeeling1", 100, type = "continuous")) +
labs( # title= "Number of peptides per quantified protein",
x = "Number of peptides per quantified protein ",
y = "Frequency",
# subtitle = "Total peptides: 62.471 \nTotal protein: 7.095"
) +
annotate(geom = "text", x = 6.5, y = 0.15, label = "median", size = 5, color = "red", angle = -90) +
annotate(geom = "text", x = 10, y = 0.15, label = "mean", size = 5, color = "blue", angle = -90) +
annotate(geom = "text", x = 40, y = 0.16, label = "Total protein: 7.095", size = 5, color = "black") +
annotate(geom = "text", x = 40, y = 0.15, label = "Total unique protein: 7.057", size = 5, color = "black") +
annotate(geom = "text", x = 40, y = 0.14, label = "Total peptides: 62.471", size = 5, color = "black") +
theme_personalised +
theme(axis.text.x=element_text(angle=45, hjust=1))
plot_peptides_freqTable###############################
#### number of times a peptide per protein
###############################
# number of peptides per quantified protein
ciona_prot.df %>%
drop_na(protein_id_transcript) %>%
arrange(desc(number_of_peptides)) %>%
group_by(number_of_peptides) %>%
dplyr::count(number_of_peptides) %>%
# filter(n < 50) %>%
mutate(number_of_peptides = as.factor(number_of_peptides)) %>%
ggplot(aes(x = fct_reorder(number_of_peptides, -n), y = n)) +
geom_col(aes(fill = n), width = 0.9, show.legend = F) +
scale_x_discrete(expand = c(0.01, 0)) +
scale_y_continuous(expand = c(0.02, 0)) +
scale_fill_gradientn(colours = wes_palette("Darjeeling1", 100, type = "continuous")) +
ggtitle(label="",
subtitle = "Peptides per protein") +
theme(
axis.text.x = element_text(angle = 45, vjust = 0.1, hjust = 1),
axis.ticks.length = unit(0, "cm")
) +
labs(
x = "Peptides #",
y = "Protein #"
) +
theme_personalised +
theme(axis.text.x=element_text(angle=45, hjust=1))###############################
#### Circular plot - top peptides x protein
###############################
plot_peptides_per_protein <- ciona_prot.df %>%
arrange(desc(number_of_peptides)) %>%
dplyr::select(protein_id_transcript, human_id, number_of_peptides) %>%
mutate(id = seq(1, nrow(.))) %>%
filter(id < 25) %>%
ggplot(aes(x = reorder(as.factor(human_id), number_of_peptides), y = number_of_peptides)) +
# geom_col(aes(fill = reorder(as.factor(Protein.Id.gene),number_of_peptides))) +
# scale_fill_viridis_d(option = "A") +
coord_flip() +
geom_col(aes(fill = id)) +
scale_fill_gradientn(colours = rev(wes_palette("Royal1", 25, type = "continuous"))) +
scale_y_continuous(breaks = c(70, 100, 150, 200, 400, 500, 800, 900)) +
labs(x = "", y = "# peptides per protein") +
theme_personalised +
theme(axis.text.x=element_text(size = 14, angle=0, hjust=1.5),
legend.position = "none",
panel.grid.major.x = element_line(color = "black", linewidth = 0.4, linetype = 2),
axis.title.y = element_text(size = 16)
) +
coord_polar(start = 0)
plot_peptides_per_proteincairo_pdf("plot_peptides_freqTable.pdf", width=8.5, height=5.6)
print(plot_peptides_freqTable)
invisible(dev.off())
cairo_pdf("plot_peptides_per_protein.pdf", width=8.5, height=5.6)
print(plot_peptides_per_protein)
invisible(dev.off())1.4 Mapping isoforms
###############################
#### Plotting isoforms : checking multiple peptides matching same protein
###############################
len_uniq(ciona_prot.df$protein_id_gene) # Number of unique protein - 7057[1] 7057
protein_isoform.list <- unique(ciona_prot.df$protein_id_gene[duplicated(ciona_prot.df$protein_id_gene)]) # list of isoforms: 35 protein have multiple isoforms
###
# Plotting isoforms dynamics
###
# protein_isoform_only <- left_join(protein_isoform_only,annotation.df, by = c("Protein.Id.gene" = "KY.gene.model"),keep =T) # not needed
protein_isoform_only <- ciona_prot.df %>%
get_dupes(protein_id_gene) %>%
arrange(desc(dupe_count))
# unite("protein_id_gene", protein_id_gene, gene.name.KY, extra.info, sep= "-") %>%
# mutate(protein_id_gene = gsub("-NA",replacement = "", protein_id_gene), protein_id_gene = gsub("KY21.",replacement = "", protein_id_gene))
## Summary with number of isoforms per protein
protein_isoform_only %>%
group_by(protein_id_gene) %>%
summarise(dupe_count = length(protein_id_gene)) %>%
arrange(desc(dupe_count)) # |># A tibble: 35 × 2
protein_id_gene dupe_count
<chr> <int>
1 KY21.Chr10.215 4
2 KY21.Chr5.818 3
3 KY21.Chr1.1682 2
4 KY21.Chr1.2263 2
5 KY21.Chr1.326 2
6 KY21.Chr1.670 2
7 KY21.Chr10.1001 2
8 KY21.Chr11.1071 2
9 KY21.Chr11.568 2
10 KY21.Chr11.747 2
# ℹ 25 more rows
# pull(dupe_count) %>%
# sum()
# Filter the duplicates isoform matching different human genes
protein_isoform_duplicates.human_id <-
protein_isoform_only %>%
group_by(protein_id_gene) %>%
filter(duplicated(protein_id_gene) & human_id != lag(human_id)) %>%
ungroup()
# clean the name
protein_isoform_duplicates_all.human_id <-
protein_isoform_only %>%
filter(protein_id_gene %in% protein_isoform_duplicates.human_id$protein_id_gene) |>
group_by(protein_id_gene) %>%
summarize(human_id = paste(unique(human_id), collapse = " - ")) |>
mutate(protein_id_gene_human_id = paste0(protein_id_gene, " - ", human_id))
p <- protein_isoform_only %>%
pivot_longer(cols = 7:14, names_to = "stage", names_transform = list(stage = as.factor), values_to = "concentration") %>%
mutate(hpf = as.numeric(as.character(recode(stage, `unfE` = "0", `fertE` = "24", `cell16` = "141", `iniG` = "260", `latN` = "434", `midTII` = "632", `latTII` = "796", `larva` = "1038")))) %>%
mutate(protein_id_gene_human_id = paste0(protein_id_gene, " - ", human_id)) |>
left_join(protein_isoform_duplicates_all.human_id, by = c("protein_id_gene")) %>%
mutate(protein_id_gene_human_id.x = if_else(!is.na(human_id.y),
protein_id_gene_human_id.y,
protein_id_gene_human_id.x
)) |>
dplyr::rename(protein_id_gene_human_id = protein_id_gene_human_id.x) |>
# slice_head(n = 50) %>%
ggplot(aes(hpf, concentration)) +
facet_grid(protein_id_gene ~ ., scales = "free_y", ) +
geom_point(aes(group = protein_id_transcript, colour = protein_id_gene)) +
geom_line(aes(group = protein_id_transcript, colour = protein_id_gene)) +
scale_color_manual(values = isoform_palette) +
labs(x = "Time (hr)", y = "Relative expression") +
scale_x_continuous(name = "Hours post fertilization", labels = c("0", "0.4", "2.65", "4.5", "7.4", "10.9", "13.5", "17.5"), breaks = c(0, 24, 141, 260, 434, 632, 796, 1038)) +
theme_personalised +
theme(legend.position = "none",
strip.background = element_rect(fill = "white"),
strip.text = element_text(colour = "black"),
strip.text.y = element_text(angle = 0, size = 8))
p_fin <- p + facet_rep_wrap(~protein_id_gene_human_id, #scales = "free_y",
repeat.tick.labels = "left") &
theme_personalised &
theme(legend.position = "none",
axis.text.x=element_text(angle=45, hjust=1),
axis.title = element_text(size= 20))
p_fincairo_pdf("plot_isofomsDynamics.pdf", width=12, height=8)
print(p_fin)
invisible(dev.off())1.5 Top and bottom proteins expressed by stage
###############################
#### Top and bottom proteins expressed by stage
###############################
# top 10 most abundant entries for each stage.
top_entries <- ciona_prot.df %>%
pivot_longer(cols = 6:13, names_to = "stage", names_transform = list(stage = as.factor), values_to = "concentration") %>%
mutate(entry = coalesce(human_id, protein_id_gene)) %>%
arrange(stage, desc(concentration)) %>%
group_by(stage) %>%
slice_max(order_by = concentration, n = 10, with_ties = FALSE)
bottom_entries <- ciona_prot.df %>%
pivot_longer(cols = 6:13, names_to = "stage", names_transform = list(stage = as.factor), values_to = "concentration") %>%
mutate(entry = coalesce(human_id, protein_id_gene)) %>%
arrange(stage, concentration) %>%
group_by(stage) %>%
slice_min(order_by = concentration, n = 10, with_ties = FALSE)
top_entries %>%
mutate(stage = factor(stage, levels = ciona_stages)) %>%
ggplot(aes(x = reorder(entry, concentration), y = concentration, fill = stage)) +
geom_bar(position = "dodge", stat = "identity") +
geom_point(aes(colour = stage)) +
facet_wrap(~stage, scales = "free") +
labs(title = "Top 10 most abundant", x = "Relative abundance", y = "") +
coord_flip() +
bottom_entries %>%
mutate(stage = factor(stage, levels = ciona_stages)) %>%
ggplot(aes(x = reorder(entry, concentration), y = concentration, fill = stage)) +
geom_bar(position = "dodge", stat = "identity") +
geom_point(aes(colour = stage)) +
labs(title = "Top 10 less abundant", y = "Relative abundance", x = "") +
facet_wrap(~stage, scales = "free") +
coord_flip() & theme_pubr(base_family = "Avenir") + theme(legend.position = "none")ciona_prot.df %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
# filter(str_detect(stage, "_byRowCol")) %>%
# mutate(stage = str_remove(stage, "_byRowCol")) %>%
ggplot(aes(x = factor(stage, levels = ciona_stages), y = value, fill = stage, color = stage)) +
geom_line(aes(group = protein_id_transcript)) +
# geom_line(aes(group = protein_id_transcript)) +
geom_path(aes(group = protein_id_transcript)) +
geom_point(color = "black") +
scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
labs(x = "Developmental stages", y = "Relative abundance") +
theme_pubr(legend = "none", base_family = "Avenir")# Top 30 changing per stage
stage_pairs <- list(
c("unfE", "fertE"),
c("fertE", "cell16"),
c("cell16", "iniG"),
c("iniG", "latN"),
c("latN", "midTII"),
c("midTII", "latTII"),
c("latTII", "larva")
)
plots_up <- list()
for (pair in stage_pairs) {
selected_stages <- pair
df_filtered <- ciona_prot.df %>%
# filter(get(paste0(selected_stages[2], "_byRowCol")) > get(paste0(selected_stages[1], "_byRowCol"))) %>%
filter(get(paste0(selected_stages[2])) > get(paste0(selected_stages[1]))) %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
# filter(str_detect(stage, "_byRowCol")) %>%
# mutate(stage = str_remove(stage, "_byRowCol")) %>%
# filter(str_detect(stage)) %>%
# mutate(stage = str_remove(stage)) %>%
filter(stage %in% selected_stages) %>%
group_by(protein_id_transcript) %>%
reframe(
protein_id_transcript = protein_id_transcript, human_id = human_id, stage = stage, value = value,
diff = value[stage == selected_stages[2]] - value[stage == selected_stages[1]]
) %>%
arrange(-diff) |>
slice_head(n = 60)
plot_title <- paste(selected_stages[1], selected_stages[2], sep = " - ")
plot <- ggplot(df_filtered, aes(
x = factor(stage, levels = ciona_stages), y = value,
group = protein_id_transcript, color = stage, label = human_id
)) +
geom_path() +
geom_point(size = 3) +
ylim(0, 1) +
scale_color_manual(values = c(setNames(ciona_stages_palette, ciona_stages), selected_stages = c("#009392FF", "#39B185FF", "#9CCB86FF", "#E9E29CFF", "#EEB479FF", "#E88471FF", "#CF597EFF")), guide = "none") +
labs(x = "", y = "Relative abundance") +
ggtitle(plot_title) +
theme_pubr(legend = "none", base_family = "Avenir") +
geom_text_repel(
data = function(df) {
df %>%
# group_by(stage) %>%
mutate(human_id = ifelse(human_id == "", protein_id_transcript, human_id)) |>
arrange(-value) %>%
slice_head(n = 30) |>
distinct(human_id, .keep_all = TRUE)
}, size = 2.5,
color = "grey",
box.padding = 0.2,
nudge_x = 1,
direction = "y",
verbose = F,
# hjust = "left",
max.overlaps = Inf,
max.time = 1,
max.iter = 1e5, segment.curvature = -0.1,
segment.ncp = 3,
segment.angle = 20
)
plots_up[[plot_title]] <- plot
}
# combined_plots_up <- wrap_plots(plotlist = plots_up, nrow = 1, align = "v")
# combined_plots_up
top_plot_2nd <- plots_up$`fertE - cell16` + plots_up$`cell16 - iniG` + plots_up$`iniG - latN` + plots_up$`latN - midTII` + plots_up$`midTII - latTII` + plots_up$`latTII - larva` + plot_layout(nrow = 1) & theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.line.y = element_blank())
plot_combined_up <- wrap_plots(plots_up$`unfE - fertE`, top_plot_2nd, nrow = 1, widths = c(0.8, 5))
plots_down <- list()
for (pair in stage_pairs) {
selected_stages <- pair
df_filtered <- ciona_prot.df %>%
# filter(get(paste0(selected_stages[1], "_byRowCol")) > get(paste0(selected_stages[2], "_byRowCol"))) %>%
filter(get(selected_stages[1]) > get(selected_stages[2])) %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
filter(stage %in% selected_stages) %>%
group_by(protein_id_transcript) %>%
reframe(
protein_id_transcript = protein_id_transcript, human_id = human_id, stage = stage, value = value,
diff = value[stage == selected_stages[1]] - value[stage == selected_stages[2]]
) %>%
arrange(-diff) |>
slice_head(n = 60)
plot_title <- paste(selected_stages[1], selected_stages[2], sep = " - ")
plot <- ggplot(df_filtered, aes(
x = factor(stage, levels = ciona_stages), y = value,
group = protein_id_transcript, color = stage, label = human_id
)) +
geom_path() +
geom_point(size = 3) +
ylim(0, 1) +
scale_color_manual(values = c(setNames(ciona_stages_palette, ciona_stages), selected_stages = c("#009392FF", "#39B185FF", "#9CCB86FF", "#E9E29CFF", "#EEB479FF", "#E88471FF", "#CF597EFF")), guide = "none") +
labs(x = "", y = "Relative abundance") +
ggtitle(plot_title) +
theme_pubr(legend = "none", base_family = "Avenir") +
geom_text_repel(
data = function(df) {
df %>%
# group_by(stage) %>%
mutate(human_id = ifelse(human_id == "", protein_id_transcript, human_id)) |>
arrange(diff) %>%
slice_head(n = 30) |>
distinct(human_id, .keep_all = TRUE)
}, size = 2.5,
box.padding = 0.2,
nudge_x = 1,
direction = "y",
verbose = F,
# hjust = "left",
max.overlaps = Inf,
color = "grey",
max.time = 1,
max.iter = 1e5,
segment.curvature = -0.1
)
plots_down[[plot_title]] <- plot
}
# combined_plots_down <- wrap_plots(plotlist = plots_down, nrow = 1, align = "v")
# combined_plots_down
bottom_plot_2nd <- plots_down$`fertE - cell16` + plots_down$`cell16 - iniG` + plots_down$`iniG - latN` + plots_down$`latN - midTII` + plots_down$`midTII - latTII` + plots_down$`latTII - larva` + plot_layout(nrow = 1) & theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.line.y = element_blank())
plot_combined_down <- wrap_plots(plots_down$`unfE - fertE`, bottom_plot_2nd, nrow = 1, widths = c(0.8, 5))
plot_TopBottom_protein <- plot_combined_up / plot_combined_down
plot_TopBottom_proteincairo_pdf("plot_TopBottom_protein.pdf", width=8.5, height=5.6)
print(plot_TopBottom_protein)
invisible(dev.off())
#_______________________________
# UnfE to ferE - down
ciona_prot.df %>%
filter(get("unfE") > get("fertE")) %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
filter(stage %in% c("unfE","fertE")) %>%
group_by(protein_id_transcript) %>%
reframe(protein_id_transcript = protein_id_transcript, human_id = human_id, stage = stage, value = value,hgnc =hgnc,
diff = value[stage == "unfE"] - value[stage == "fertE"]) %>%
arrange(-diff) |>
pivot_wider(1:3, names_from = stage,values_from = value ) |>
write.table(paste0(directory_figures,"unf-fert-DEP_down.tsv"), sep =" \t", col.names = T, row.names = F,quote = F)
# slice_head(n = 60)
# cell16 to iniG - up
ciona_prot.df %>%
filter(get("iniG") > get("cell16")) %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
filter(stage %in% c("cell16","iniG")) %>%
group_by(protein_id_transcript) %>%
reframe(protein_id_transcript = protein_id_transcript, human_id = human_id, stage = stage, value = value,hgnc =hgnc,
diff = value[stage == "iniG"] - value[stage == "cell16"]) %>%
arrange(-diff) |>
pivot_wider(1:3, names_from = stage,values_from = value ) |>
write.table(paste0(directory_figures,"cell16-iniG-DEP_up.tsv"), sep =" \t", col.names = T, row.names = F,quote = F)
# slice_head(n = 60)
# latTII to larva - up
ciona_prot.df %>%
filter(get("larva") > get("latTII")) %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
filter(stage %in% c("latTII","larva")) %>%
group_by(protein_id_transcript) %>%
reframe(protein_id_transcript = protein_id_transcript, human_id = human_id, stage = stage, value = value,hgnc =hgnc,
diff = value[stage == "larva"] - value[stage == "latTII"]) %>%
arrange(-diff) |>
pivot_wider(1:3, names_from = stage,values_from = value ) |>
write.table(paste0(directory_figures,"latTII-larva-DEP_up.tsv"), sep =" \t", col.names = T, row.names = F,quote = F)
# slice_head(n = 60)1.6 Protein distribution
- calculation of quantile x stage
- Below 10% from max value each protein: we consider the protein not being expressed
1.6.1 Boxplots & density
###############################
#### Quantile summary
###############################
ciona_prot.df %>%
pivot_longer(cols = 6:13, names_to = "stage", names_transform = list(stage = as.factor), values_to = "concentration") %>%
group_by(stage) %>%
mutate(
min = quantile(concentration, probs = 0),
qnt_10 = quantile(concentration, probs = 0.1),
qnt_25 = quantile(concentration, probs = 0.25),
qnt_50 = quantile(concentration, probs = 0.5),
qnt_75 = quantile(concentration, probs = 0.75),
qnt_90 = quantile(concentration, probs = 0.9),
max = quantile(concentration, probs = 1),
mean = mean(concentration),
median = median(concentration),
sd = sd(concentration)
) %>%
distinct(min, qnt_10, qnt_25, qnt_50, qnt_75, qnt_90, max, mean, median, sd)# A tibble: 8 × 11
# Groups: stage [8]
stage min qnt_10 qnt_25 qnt_50 qnt_75 qnt_90 max mean median sd
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 unfE 0.000000524 0.0276 0.0761 0.108 0.125 0.145 0.931 0.105 0.108 0.0740
2 fertE 0.000000408 0.0286 0.0741 0.104 0.121 0.139 0.611 0.0979 0.104 0.0481
3 cell16 0.00000135 0.0449 0.0928 0.117 0.130 0.146 0.914 0.108 0.117 0.0439
4 iniG 0.000000527 0.0477 0.100 0.117 0.129 0.145 0.780 0.109 0.117 0.0425
5 latN 0.000000299 0.0752 0.112 0.125 0.138 0.155 0.670 0.121 0.125 0.0397
6 midTII 0.00000795 0.0980 0.116 0.129 0.146 0.171 0.573 0.132 0.129 0.0402
7 latTII 0.0000489 0.106 0.125 0.141 0.168 0.224 0.703 0.154 0.141 0.0605
8 larva 0.0000548 0.0983 0.123 0.143 0.180 0.279 0.971 0.173 0.143 0.107
###############################
#### Protein distribution
###############################
ciona_prot.df %>%
pivot_longer(cols = 6:13, names_to = "stage", names_transform = list(stage = as.factor), values_to = "concentration") %>%
group_by(stage) %>%
mutate(stage = factor(stage, levels = rev(ciona_stages))) %>%
ggplot(aes(x = stage, y = concentration)) +
ggdist::stat_halfeye(aes(colour = stage, fill = after_scale(lighten(colour, .5))), adjust = .5, scale = 0.7, width = .75, .width = 0, justification = -.4, point_colour = NA) +
geom_point(aes(colour = stage, colour = after_scale(darken(colour, .1, space = "HLS"))), fill = "white", shape = 21, stroke = .4, size = 1.5, position = position_jitter(seed = 1, width = .12)) +
geom_boxplot(aes(colour = stage, colour = after_scale(darken(colour, .1, space = "HLS")), fill = after_scale(desaturate(lighten(colour, .8), .4))), width = .4, outlier.shape = NA) +
stat_summary(geom = "point", size = 2, fun = "mean", aes(colour = stage, colour = after_scale(darken(colour, .3, space = "HLS")))) +
coord_flip() +
scale_color_manual(values = rev(ciona_stages_palette), guide = "none") +
# scale_fill_manual(values = rev(ciona_stages_palette), guide = "none") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 8, by = 1), expand = c(.01, 0)) +
labs(x = NULL, y = "Relative abundance") +
theme_personalised +
theme(text = element_text(size =15),
legend.position = "none",
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_text(color = rev(darken(ciona_stages_palette, .1, space = "HLS")), size = 18),
axis.title.x = element_text(margin = margin(t = 10), size = 16)
)1.6.2 Distribution per chromosome
###############################
#### Distrubtion of proteins by chromosome
###############################
# total genes per chromosome
genes.tot <- read_table("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/annotation_ciona.KY21.tsv")
genesTot_chr <-
genes.tot %>%
dplyr::select(1) %>%
unique() |>
mutate(
GeneID_chr = str_remove(GeneID, "\\.[^.]+$"), # Extract everything after last .
group = gsub("*KY21.", "", GeneID_chr),
chr = gsub("*Chr", "", group)
) %>%
filter(!grepl("UAContig", chr)) %>%
dplyr::count(chr) %>%
arrange(as.numeric(chr))
# pull(n) |> sum()
# Stats by chromosome in TMTproC
prot.per.chr.df <-
ciona_prot.df %>%
group_by(protein_id_transcript) %>%
mutate(
protein_id_transcript = gsub(".v.*", "", protein_id_transcript), # Extract everything behind ".v."
chme = gsub("*KY21.", "", protein_id_transcript),
group = gsub("\\..*", "", chme),
chr = gsub("*Chr", "", group)
) %>%
filter(!grepl("UAContig", group)) %>%
group_by(chr) %>%
tally(sort = T) %>%
arrange(as.numeric(chr))
chr_freq_df <-
prot.per.chr.df %>%
left_join(genesTot_chr, by = "chr") %>%
group_by(chr) %>%
mutate(
percent = (n.x / n.y), # getting frequency
percent_long = paste0(round(percent * 100, 0), " %")
)
colourCount <- 14
getPalette <- colorRampPalette(brewer.pal(14, "Accent"))
plot_prot_per_chr <- chr_freq_df %>%
mutate(chr = factor(chr)) %>%
ggplot(aes(x = as.numeric(chr), y = percent, fill = getPalette(colourCount))) +
geom_col() +
scale_x_continuous(breaks = 1:14) +
scale_y_continuous(labels = scales::percent_format(),position = "right") +
scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Accent"))(colourCount)) +
guides(fill = "none") +
labs(y = "Coverage [%]", x = "Chromosome",
# title = "Protein distribution per chromosome"
) +
# geom_text(aes(label = percent_long), color="black") +
theme_personalised +
theme(text = element_text(size =15)) +
coord_flip(xlim = c(14,1))
plot_prot_per_chrcairo_pdf("prot_per_chr.pdf", width=8.5, height=5.6)
print(plot_prot_per_chr)
invisible(dev.off())1.6.3 Protein distribution by selecting >10% max cutoff x protein
###############################
#### Quantile and 10%
###############################
prot.matrix <-
ciona_prot.df %>%
dplyr::select(1, 6:13) %>%
column_to_rownames(var = "protein_id_transcript") %>%
as.matrix()
quantiles_prot.matrix <-
as.data.frame(rowQuantiles(prot.matrix, probs = c(.10, 1), useNames = T)) %>%
rownames_to_column(var = "protein_id_transcript") %>%
rename_with(ends_with("%"), .fn = ~ paste0(., "_quantile")) %>%
mutate(`10%max.row.value` = (`100%_quantile` * 10) / 100)
ciona_quantProt.df <- ciona_prot.df %>% left_join(quantiles_prot.matrix, by = "protein_id_transcript")
quantile_prot.df <- ciona_quantProt.df %>%
group_by(protein_id_transcript) %>%
summarise(
protein_id_transcript = protein_id_transcript,
across(unfE:larva, ~ sum(. > `10%_quantile`))
)
quantile_offProt <- quantile_prot.df %>%
summarise_all(~ sum(. == 0)) %>%
mutate(state = "off")
quantile_onProt <- quantile_prot.df %>%
summarise_all(~ sum(. == 1)) %>%
mutate(state = "on")
onOffProt_10quantile.db <- bind_rows(quantile_offProt, quantile_onProt) %>%
relocate(10, 2:9) %>%
mutate(protein_id_transcript = "quantile10%")
max10_prot.df <- ciona_prot.df %>%
rowwise() |>
mutate(`10%max.row.value` = max(c_across(6:13), na.rm = TRUE) * 0.10) %>%
group_by(protein_id_transcript) %>%
summarise(
protein_id_transcript = protein_id_transcript,
across(unfE:larva, ~ sum(. > `10%max.row.value`))
)
max10_offProt <- max10_prot.df %>%
summarise_all(~ sum(. == 0)) %>%
mutate(state = "off")
max10_onProt <- max10_prot.df %>%
summarise_all(~ sum(. == 1)) %>%
mutate(state = "on")
onOffProt_10max.db <- bind_rows(max10_offProt, max10_onProt) %>%
relocate(10, 2:9) %>%
mutate(protein_id_transcript = "max10%")
# complete table
onOff_prot.db <-onOffProt_10max.db
bind_rows(onOffProt_10max.db) %>%
dplyr::rename(type = protein_id_transcript)# A tibble: 2 × 10
state unfE fertE cell16 iniG latN midTII latTII larva type
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <chr>
1 off 731 752 618 614 420 187 87 90 max10%
2 on 6364 6343 6477 6481 6675 6908 7008 7005 max10%
# write.table(KY.TMTproC.df,file = "protein-quantile-rank-on.off.KY21.txt", sep='\t', quote = F, col.names=T)
###############################
# calculate the number of proteins that are stage-specific and core
###############################
# 10% max expression value is good
stage_specific_perc <-
max10_prot.df %>%
mutate(sum_count = rowSums(dplyr::select(., -protein_id_transcript))) %>%
filter(sum_count == 1) %>%
tally() # 31
stage_all_perc <-
max10_prot.df %>%
mutate(sum_count = rowSums(dplyr::select(., -protein_id_transcript))) %>%
filter(sum_count == 8) %>%
tally() # 6102 core protein
###############################
# plotting the info together
###############################
onOff_prot_pivot.db <-
onOff_prot.db %>%
pivot_longer(2:9, names_to = "stage", values_to = "counts") %>%
mutate(name = factor(stage, levels = ciona_stages))
onOff_prot_pivot.db$stage <- factor(onOff_prot_pivot.db$stage, levels = ciona_stages)
onOff_prot_pivot.db %>%
filter(state == "on") %>%
ggplot(aes(x = fct_inorder(stage), y = counts, fill = stage)) +
geom_boxplot() +
scale_fill_manual(values = ciona_stages_palette, guide = "none") +
labs(title = "Zoom protein counts", x = "Stage", y = "Count") +
theme_pubr(base_family = "Avenir", legend = "none")to_plot <- onOff_prot_pivot.db %>%
filter(state == "on") %>%
ggplot(aes(x = fct_inorder(stage), y = counts, fill = stage)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(
values = ciona_stages_palette,
guide = guide_legend(title = "Type", ncol = 1)
) +
labs(title = "10% max", x = "Stage", y = "Count") +
geom_hline(yintercept = 6102, color = "black", linetype = "dashed", size = 1) +
theme_pubr(base_family = "Avenir", legend = "none")
to_plotcairo_pdf("plot_onOff_protein.pdf", width=8.5, height=5.6)
print(to_plot)
invisible(dev.off())1.6.4 Uniprot comparison
###############################
# Download file from Uniprot
###############################
'https://www.uniprot.org/proteomes/UP000008144'
# Number of entries :17,311
# AABS01000000 (17298)+ AJ517314 (14)
# Extract protein name and PE info and clean up the file
paste <(grep "^>" uniprotkb_proteome_ciona.fasta | cut -d " " -f 1 | sed 's/^>//') <(grep "^>" uniprotkb_proteome_ciona.fasta | cut -d "_" -f 2- | awk '{gsub(/ PE=/, "\tPE="); gsub(/ SV=/, "\tSV="); gsub(/ OS=/, "\tOS="); print}') > ciona_uniprot_annotation_toClean.tsv # Clean up the file ==> ciona_uniprot_annotation.tsv
# delete column CIOIN
# delete OS=Ciona intestinalis OX=7719 GN=
ciona_unipro_annotation.df <- read.table("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/b.raw_data/RBH/ciona_uniprot_annotation.tsv", sep="\t",header=T, quote = "") |>
as_tibble()
###############################
# Blasting Uniprot vs Ghost to get common entries
###############################
header <- c("queryId","subjectId","identity","alignmentLength","mismatch","gapopen","qstart","qend","sstart","send","evalue","bitscore")
# Load the data without headers and assign column names
blast.df <- read.table("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/b.raw_data/RBH/uniprot_ciona-blastResults", sep="\t", header=FALSE, col.names=header) |> as_tibble()
blastReverse.df <- read.table("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/b.raw_data/RBH/uniprot_ciona-blastResults_reciprocal", sep="\t", header=FALSE, col.names=header) |> as_tibble()
# define a function to find the best hit for each query
find_best_hit <- function(blast_output) {
best_hits <- blast_output %>%
group_by(queryId) %>%
slice_max(bitscore, n = 1,with_ties = FALSE)
# top_n(1, bitscore)
# dplyr::slice(which.max(bitscore))
return(best_hits)
}
# Find the best hit for each query in both BLASTp results
best_hits <- find_best_hit(blast.df)
best_hits_reciprocal <- find_best_hit(blastReverse.df) # we are interested in this output, eg all the Uniprot protein mapped to ghost eg how much Uniprot dataset is present in our data
# # Save results
# write.table(best_hits_reciprocal, "rbhh_ciona_uniprot_onlyUniprot.tsv", sep = "\t", row.names = FALSE,quote = F)
###############################
# Pie chart
###############################
# merging the uniprot and blast results and TMTc data
uniprot_piechart.df <-
ciona_unipro_annotation.df |>
left_join(best_hits_reciprocal |> dplyr::select(1:3), by = c("ID_protein"="queryId")) |>
right_join(ciona_prot.df |> dplyr::select(1,2,3,5), by = c("subjectId"="protein_id_gene")) # names mapping looks good
#
uniprot_piechart.df <- uniprot_piechart.df %>%
mutate(peptide_category = case_when(
number_of_peptides == 1 ~ "1",
number_of_peptides == 2 ~ "2",
number_of_peptides > 2 & number_of_peptides <= 10 ~ ">2",
number_of_peptides > 10 ~ ">10",
TRUE ~ "Other"
)) |>
drop_na()
uniprot_piechart.df %>%
tabyl(PE) PE n percent
PE=1 4 0.0004246735
PE=2 181 0.0192164773
PE=3 3352 0.3558764200
PE=4 5882 0.6244824291
# tally(n) # 578 not recognsised
uniprot_piechart.df %>%
tabyl(PE,peptide_category) PE >10 >2 1 2
PE=1 0 1 2 1
PE=2 40 85 34 22
PE=3 1058 1575 365 354
PE=4 1608 2590 939 745
uniprot_piechart.df %>%
group_by(PE) |>
tabyl(peptide_category) peptide_category n percent
1 1340 0.1422656
2 1122 0.1191209
>10 2706 0.2872916
>2 4251 0.4513218
uniprot_piechart.df |> tally() # uniprot protein 8958# A tibble: 1 × 1
n
<int>
1 9419
uniprot_piechart.df |> distinct(protein_id_transcript) # 6,456 protein isoform# A tibble: 6,456 × 1
protein_id_transcript
<chr>
1 KY21.Chr11.21.v1.nonSL2-1
2 KY21.Chr14.753.v1.nonSL1-1
3 KY21.Chr1.2252.v1.SL1-1
4 KY21.Chr11.212.v1.SL1-1
5 KY21.Chr11.681.v1.nonSL6-1
6 KY21.Chr1.2052.v1.SL1-1
7 KY21.Chr14.554.v3.nonSL2-1
8 KY21.Chr9.273.v1.SL3-1
9 KY21.Chr5.221.v1.nonSL9-1
10 KY21.Chr9.704.v1.SL1-1
# ℹ 6,446 more rows
uniprot_piechart.df |> distinct(subjectId) # 6,419 genes# A tibble: 6,419 × 1
subjectId
<chr>
1 KY21.Chr11.21
2 KY21.Chr14.753
3 KY21.Chr1.2252
4 KY21.Chr11.212
5 KY21.Chr11.681
6 KY21.Chr1.2052
7 KY21.Chr14.554
8 KY21.Chr9.273
9 KY21.Chr5.221
10 KY21.Chr9.704
# ℹ 6,409 more rows
# # calculate the frequency of each unique value in the "PE" column.
pe_data <- as.data.frame(table(uniprot_piechart.df$PE))
pe_data$Percentage <- round((pe_data$Freq / sum(pe_data$Freq)) * 100,1)
# donuts
pe_data$label_pos <- cumsum(pe_data$Freq) - 0.5 * pe_data$Freq
radius <- 1.2 # Adjust this value as needed to move labels in or out
pe_data$label_x <- radius * cos(pi * (pe_data$label_pos / sum(pe_data$Freq)))
pe_data$label_y <- radius * sin(pi * (pe_data$label_pos / sum(pe_data$Freq)))
a <- ggplot(pe_data, aes(x = "", y = Freq, fill = Var1)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(label = sprintf("%.1f%%", Percentage)), position = position_stack(vjust = 0.5)) +
scale_fill_manual(values = wes_palette(name = "Royal2")) +
theme_void() +
labs(fill = "PE")
color_mapping <- c("PE=1" = "#ffd584", "PE=2" = "#f9c0a0", "PE=3" = "#ff9895", "PE=4" = "#8B7700")
b <-
ggplot(pe_data, aes(x = "", y = Freq, fill = Var1)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(x = label_x, y = label_y, label = sprintf("%.1f%%", Percentage)), color = "black") +
scale_fill_manual(values = color_mapping) + # Using your custom colors here
# scale_fill_manual(values = rev(wes_palette(name = "Royal2",n=4))) +
theme_void() +
labs(fill = "PE") +
theme(text = element_text(family = "Avenir"))
a+bcairo_pdf("plot_PE_per_Category_correct.pdf", width=8.5, height=5.6)
print(b)
invisible(dev.off())
### peptides per protein
pie_data <- uniprot_piechart.df %>%
group_by(PE) %>%
summarise(total_peptides = sum(number_of_peptides))
pie_data <- uniprot_piechart.df %>%
group_by(PE, peptide_category) %>%
summarise(count = n(), .groups = 'drop')
ggplot(pie_data, aes(x = "", y = count, fill = peptide_category)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
facet_wrap(~ PE, ncol = 2) + # Adjust 'ncol' as needed
geom_text(aes(label = count), position = position_stack(vjust = 0.5)) +
theme_void() +
labs(fill = "Peptide Category", title = "Faceted Pie Chart by PE")pie_data <- uniprot_piechart.df %>%
group_by(PE, peptide_category) %>%
# summarise(frequency = n(), .groups = 'drop') %>%
summarise(count = n()) %>%
mutate(freq_percent = count / sum(count) * 100)
library(monochromeR)
palette_brown <- generate_palette("#8B7700", modification = "go_lighter", #go_darker
n_colours = 4, view_palette = TRUE)palette_brown <- generate_palette("#ff9895", modification = "go_lighter",
n_colours = 4, view_palette = TRUE)palette_brown <- generate_palette("#f9c0a0", modification = "go_lighter",
n_colours = 4, view_palette = TRUE)PE_per_Category <- ggplot(pie_data |> filter(PE != "PE=1"), aes(x = "", y = freq_percent, fill = peptide_category)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
facet_wrap(~ PE, nrow = 1) + # Adjust 'ncol' as needed
geom_text(aes(label = paste0(round(freq_percent, 1), "%")), position = position_stack(vjust = 0.5)) +
# scale_fill_manual(values = rev(palette_purple)) +
scale_fill_manual(values = palette_brown ) +
theme_void() +
theme(text=element_text(family="Avenir"),
legend.position = "bottom",
legend.direction="horizontal") +
labs(fill = "Peptide Category")
PE_per_Categorycairo_pdf("plot_PE_per_Category_brown.pdf", width=8.5, height=5.6)
cairo_pdf("plot_PE_per_Category_darkPink.pdf", width=8.5, height=5.6)
cairo_pdf("plot_PE_per_Category_lightPink.pdf", width=8.5, height=5.6)
print(PE_per_Category)
invisible(dev.off())1.6.5 Mapping TF & SM annotation
There are 2 annotation: 1) one from the ghost dataset 2) one from aniseed / Nitta et al 2019 paper
I am using a combination of both - the Nitta ppaper focus more on TF
###############################
# annotation Ghost
###############################
color_palette <- c(
"TF" = "#00A5CF",
"ZF" = "#DE1A1A",
"signalling" = "#574AE2",
"Kinases" = "#FFBF00",
"Phosphatases" = "#29BF12"
)
# Load the annotation data from an Excel file
ciona_annGhost.df <- read_excel("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/annotationKY21ghost_TFs.and.signalling_february_2023.xlsx",
sheet = "all_togheter_clean",
col_names = TRUE,
trim_ws = TRUE) %>%
as_tibble()
# Vector with stages in develpmental order
ciona_stages <- c("unfE","fertE","cell16","iniG","latN","midTII","latTII","larva")
# Join with protein data
ciona_prot_annGhost.df <- ciona_annGhost.df %>%
left_join(ciona_prot.df, by = c("KY_gene_model" = "protein_id_gene"))
# Get the expressed protein data, excluding NA values in the proteome
expressed_data <- ciona_prot_annGhost.df$Type[!is.na(ciona_prot_annGhost.df$unfE)]
# Count occurrences of each factor level
level_counts <- table(expressed_data) %>%
as.data.frame.table()
# Count total expressed of each factor level
total_expressed <- table(ciona_prot_annGhost.df$Type) %>%
as.data.frame.table()
# Merge the counts and calculate frequencies
coverage_per_level <- merge(level_counts, total_expressed, by.x = "expressed_data", by.y = "Var1") %>%
mutate(Freq = Freq.x / Freq.y)
# Reorder the levels of 'expressed_data' and remove any rows with NA values
coverage_per_level <- coverage_per_level %>%
mutate(expressed_data = factor(expressed_data,
levels = rev(c("TF", "signalling", "ZF", "Phosphatases", "Kinases")))) %>%
drop_na(expressed_data)
# # vertical
ghost <-
ggplot(coverage_per_level, aes(x = Freq, y = expressed_data, color = expressed_data)) +
geom_segment(aes(x = 0, xend = Freq, y = expressed_data, yend = expressed_data), size = 1) +
geom_point(size = 4) +
scale_x_continuous(labels = percent_format(), limits = c(0, 1)) +
labs(x = "Coverage (%)", y = "") +
scale_color_manual(values = color_palette) +
theme_personalised +
theme(axis.text.y = element_text(angle = 0, hjust = 1),
legend.position = "none") +
geom_text(aes(label = Freq.x, y = expressed_data, x = 0), hjust = 1.3, color = "black", size = 3.5)
###############################
# Annotation Ghost + Nitta
###############################
# Load the annotation data from an Excel file
ciona_annGhost_Nitta.df <- read_excel("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/KY21_geneAnnotation_ghost&nitta2019.xlsx",
sheet = "tf_and_signal",
col_names = TRUE,
trim_ws = TRUE) %>%
as_tibble()
# Join with protein data
ciona_prot_annGhost_Nitta.df <- ciona_annGhost_Nitta.df %>%
left_join(ciona_prot.df, by = c("KY_gene_model" = "protein_id_gene"))
# Get the expressed protein data, excluding NA values in the proteome
expressed_data <- ciona_prot_annGhost_Nitta.df$Type[!is.na(ciona_prot_annGhost_Nitta.df$unfE)]
# Count occurrences of each factor level
level_counts <- table(expressed_data) %>%
as.data.frame.table()
# Count total expressed of each factor level
total_expressed <- table(ciona_prot_annGhost_Nitta.df$Type) %>%
as.data.frame.table()
# Merge the counts and calculate frequencies
coverage_per_level <- merge(level_counts, total_expressed, by.x = "expressed_data", by.y = "Var1") %>%
mutate(Freq = Freq.x / Freq.y)
# Reorder the levels of 'expressed_data' and remove any rows with NA values
coverage_per_level <- coverage_per_level %>%
mutate(expressed_data = factor(expressed_data,
levels = rev(c("TF", "signalling", "ZF", "Phosphatases", "Kinases")))) %>%
drop_na(expressed_data)
# # vertical
ghost_nitta <-
ggplot(coverage_per_level, aes(x = Freq, y = expressed_data, color = expressed_data)) +
geom_segment(aes(x = 0, xend = Freq, y = expressed_data, yend = expressed_data), size = 1) +
geom_point(size = 4) +
scale_x_continuous(labels = percent_format(), limits = c(0, 1)) +
labs(x = "Coverage (%)", y = "") +
scale_color_manual(values = color_palette) +
theme_personalised +
theme(axis.text.y = element_text(angle = 0, hjust = 1),
legend.position = "none") +
geom_text(aes(label = Freq.x, y = expressed_data, x = 0), hjust = 1.3, color = "black", size = 3.5)
(ghost + ggtitle("ghost annotation only")) / ghost_nitta + ggtitle("ghost and Nitta et al.2019 annotation")cairo_pdf("plot_TF_SM_annotation.pdf", width=8.5, height=5.6)
print(ghost_nitta)
invisible(dev.off())